In [ ]:
%pylab inline
# %pylab provides the numpy and plotting names (rcParams, arange, bar, plot, savefig, plt, ...) used below
from __future__ import division
import codecs
import pickle
import networkx as nx
from collections import Counter

rcParams['figure.figsize'] = (12.0, 10.0)
rcParams['font.family'] = 'Times New Roman'

In [ ]:
from os.path import abspath
workspace = "/".join(abspath('.').split('/')[:-1])

Note: Make sure that workspace points to the root directory of openie_eval.
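
A quick sanity check may help here; this is a hypothetical snippet that only assumes the data directory used throughout this notebook lives under the repository root:

In [ ]:
import os
# Fail early if workspace does not point at the repository root (all data paths below depend on it).
assert os.path.isdir(os.path.join(workspace, 'data')), workspace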


In [ ]:
from openie_eval.openie_eval import semantic_parsing as sp
from openie_eval.openie_eval import ontologization
reload(sp)
reload(ontologization)

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [ ]:
keyword = 'carnatic_music'

wiki_entities = codecs.open(workspace + '/data/ground-truth/'+keyword+'_pages.txt', encoding='utf-8').readlines()
wiki_entities = [i.strip().lower() for i in wiki_entities]

methods = ['reverb', 'openie', 'semantic-parsing']
labels = {'reverb': 'ReVerb', 'openie': 'OpenIE 4.0', 'semantic-parsing': 'Sem. Parsing'}
colors = ['#990033', '#006600', '#330066']

#coref_suffix = ''
coref_suffix = '-coref'

#filtered_suffix = ''
filtered_suffix = '-filtered'

Entity identification


In [ ]:
print len(wiki_entities)
for method in methods:
    relations = pickle.load(file(workspace + '/data/'+method+'/'+keyword+'/relations'+coref_suffix+filtered_suffix+'.pickle'))
    relations = [[i['arg1'].lower(), lemmatizer.lemmatize(i['rel'].lower(), pos='v'), i['arg2'].lower()] for i in relations]
    candidate_entities = [i[0] for i in relations]
    overlap = set(candidate_entities).intersection(wiki_entities)
    residual = set(candidate_entities)-set(wiki_entities)
    # Columns: method, |overlap|, |residual|, overlap ratio vs. Wikipedia entities, residual ratio vs. candidates
    print method, len(overlap), len(residual), round(len(overlap)/len(wiki_entities), 2), round(len(residual)/len(set(candidate_entities)), 2)

Rule-based

Create rules


In [ ]:
#carnatic
class_terms = {}
class_terms['carnatic_ragas'] = ['raga', 'raaga', 'scale']
class_terms['carnatic_singers'] = ['vocalist', 'singer']
class_terms['carnatic_composers'] = ['composer', 'poet']
class_terms['carnatic_instrumentalists'] = ['instrumentalist', 'player', 'violinist']
class_terms['carnatic_compositions'] = ['composition', 'song']
class_terms['carnatic_musicians'] = list(concatenate([class_terms[i] for i in ['carnatic_singers', 'carnatic_composers', 'carnatic_instrumentalists']]))
class_terms['carnatic_musicians'].append('artist')

out_file = workspace + '/data/results/qualitative/entity-identification/rule-based/carnatic_music/rules.pickle'
pickle.dump(class_terms, file(out_file, 'w'))

In [ ]:
#hindustani
class_terms = {}
class_terms['hindustani_ragas'] = ['raga', 'raaga', 'raag', 'rag', 'scale', u'rāga']
class_terms['hindustani_singers'] = ['vocalist', 'singer']
class_terms['hindustani_composers'] = ['composer', 'poet']
class_terms['hindustani_instrumentalists'] = ['instrumentalist', 'player', 'violinist']
#class_terms['carnatic_compositions'] = ['composition', 'song']
class_terms['hindustani_musicians'] = list(concatenate([class_terms[i] for i in ['hindustani_singers', 'hindustani_composers', 'hindustani_instrumentalists']]))
class_terms['hindustani_musicians'].append('artist')

out_file = workspace + '/data/results/qualitative/entity-identification/rule-based/hindustani_music/rules.pickle'
pickle.dump(class_terms, file(out_file, 'w'))

Class assignment


In [ ]:
keyword = 'hindustani_music'

coverage = {}
labelled_class_instances = {}

rules = pickle.load(file(workspace + '/data/results/qualitative/entity-identification/rule-based/'+keyword+'/rules.pickle'))
groundtruth = ontologization.load_groundtruth(keyword, rules.keys())
class_terms = rules  # same mapping as rules; reused below for plot labels and tick names

for method in methods:
    relations = pickle.load(file(workspace + '/data/'+method+'/'+keyword+'/relations'+coref_suffix+filtered_suffix+'.pickle'))
    relations = [[i['arg1'].lower(), lemmatizer.lemmatize(i['rel'].lower(), pos='v'), i['arg2'].lower()] for i in relations]
    
    class_instances = ontologization.class_instances_by_rules(relations, rules)
    res = ontologization.analyze_coverage(class_instances, groundtruth)
    coverage[method] = res['coverage']
    labelled_class_instances[method] = res['labelled_class_instances']

In [ ]:
def label_numbers(rects, numbers):
    # Attach count labels to the bars; relies on the module-level ax from the plotting cells.
    for rect, number in zip(rects, numbers):
        text_label = str(number)
        if text_label == '0':
            continue
        height = rect.get_height()
        ax.text(rect.get_x()+rect.get_width()/2., height-0.03, '%s'%(text_label),
                fontsize=22, ha='center', va='bottom', color='w')

In [ ]:
rcParams['figure.figsize'] = (12.0, 10.0)

fig, ax = plt.subplots()

bar_width = 0.2
index = arange(len(class_terms))

count = 0
all_fp_ratios = []

for method in methods:
    overlap_scores = [i[0] for i in coverage[method]]
    rects = bar(index, overlap_scores, width=bar_width, color=colors[count], label=labels[method])
    label_numbers(rects, [len(labelled_class_instances[method][i]['tp']) for i in class_terms.keys()])
    fp_ratios = [i[1] for i in coverage[method]]
    all_fp_ratios.extend(zip(index+bar_width/2.0, fp_ratios))
    index = index+bar_width
    count += 1
    
all_fp_ratios = array(sorted(all_fp_ratios, key=lambda x:x[0]))
stem(all_fp_ratios[:, 0], all_fp_ratios[:, 1], linefmt='k--', markerfmt='ko')

fontsize=30
xlabel('Concepts', fontsize=fontsize+2)
ylabel('Overlap ($O$) with reference data', fontsize=fontsize+2)
if keyword == 'carnatic_music':
    xticks(index-1.5*bar_width, [i[9:] for i in class_terms.keys()])
else:
    xticks(index-1.5*bar_width, [i[11:] for i in class_terms.keys()])
legend(prop={'size': fontsize}, loc='upper left', 
       fancybox=True)

xticks(fontsize=fontsize, rotation=14)
yticks(fontsize=fontsize)

In [ ]:
ylim(0, 0.74)

In [ ]:
fname = workspace + '/data/results/qualitative/entity-identification/rule-based/'+keyword+'/class-agreement-with-wikipedia'
savefig(fname+'.pdf', dpi=200, facecolor='w', edgecolor='w', orientation='landscape', 
        papertype=None, format=None, transparent=False, bbox_inches='tight', pad_inches=0.1)
savefig(fname+'.png', dpi=200, facecolor='w', edgecolor='w', orientation='landscape', 
        papertype=None, format=None, transparent=False, bbox_inches='tight', pad_inches=0.1)

In [ ]:
close('all')

In [ ]:
agreement_scores = ontologization.compute_agreement(labelled_class_instances, methods)

In [ ]:
rcParams['figure.figsize'] = (12.0, 10.0)
inter_labels = {'reverb-openie': 'ReVerb-OpenIE 4.0', 'openie-semantic-parsing': 'OpenIE 4.0-Sem. Parsing', 
                'reverb-semantic-parsing': 'ReVerb-Sem. Parsing'}

fig, ax = plt.subplots()

bar_width = 0.2
index = arange(len(class_terms))
count = 0
for method, res in agreement_scores.items():
    scores = [i[0] for i in res]
    abs_numbers = [len(i[1]) for i in res]
    
    rects = bar(index, scores, bar_width, color=colors[count], label=inter_labels[method])
    label_numbers(rects, abs_numbers)
    
    index = index+bar_width
    count += 1

fontsize=30
xlabel('Concepts', fontsize=fontsize+2)
ylabel('Inter-system agreement over $R$', fontsize=fontsize+2)
if keyword == 'carnatic_music':
    xticks(index-1.5*bar_width, [i[9:] for i in class_terms.keys()])
else:
    xticks(index-1.5*bar_width, [i[11:] for i in class_terms.keys()])
legend(prop={'size': fontsize}, loc='upper center', 
       bbox_to_anchor=(0.5, 1.2), fancybox=True)

xticks(fontsize=fontsize, rotation=10)
yticks(fontsize=fontsize)

In [ ]:
ylim(0, 1.05)

In [ ]:
fname = workspace + '/data/results/qualitative/entity-identification/rule-based/'+keyword+'/class-agreement-inter-method'
savefig(fname+'.pdf', dpi=200, facecolor='w', edgecolor='w', orientation='landscape', 
        papertype=None, format=None, transparent=False, bbox_inches='tight', pad_inches=0.1)
savefig(fname+'.png', dpi=200, facecolor='w', edgecolor='w', orientation='landscape', 
        papertype=None, format=None, transparent=False, bbox_inches='tight', pad_inches=0.1)

Bootstrapping

Distance measure: cosine similarity between the split-object vector of the seed set and that of a given entity. For the seed set, we only consider split objects that occur more than once.

  • Iterate over the candidate set and select the entity nearest to the seed set
  • Merge it with the seed set and recompute the seed set's split-object vector
  • Repeat (a minimal sketch of this loop follows the lists below)

Variables to play with:

  • Seed set size
  • Number of iterations
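
The actual selection is done by ontologization.bootstrap_lsa below; the following is only a minimal, self-contained sketch of the loop described above, assuming bag-of-words split-object vectors and plain cosine similarity (all names in it are illustrative):

In [ ]:
from collections import Counter
from math import sqrt

def cosine(a, b):
    # Cosine similarity between two sparse count vectors (Counters).
    num = sum(a[k]*b.get(k, 0) for k in a)
    den = sqrt(sum(v*v for v in a.values())) * sqrt(sum(v*v for v in b.values()))
    return num/den if den else 0.0

def bootstrap_sketch(seed_entities, entity_vectors, iterations):
    # entity_vectors: {entity: Counter of the split objects it co-occurs with}
    seeds = list(seed_entities)
    seed_vector = Counter()
    for e in seeds:
        seed_vector.update(entity_vectors.get(e, {}))
    # Keep only the split objects that occur more than once in the seed set.
    seed_vector = Counter({k: v for k, v in seed_vector.items() if v > 1})
    for _ in range(iterations):
        candidates = [e for e in entity_vectors if e not in seeds]
        if not candidates:
            break
        # Select the candidate nearest to the current seed set ...
        nearest = max(candidates, key=lambda e: cosine(entity_vectors[e], seed_vector))
        # ... merge it and recompute the seed set's split-object vector.
        seeds.append(nearest)
        seed_vector.update(entity_vectors[nearest])
    return seeds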

In [ ]:
from random import shuffle
reload(ontologization)

#NOTE: Run the rule-based section above first; this part reuses class_instances and groundtruth from it.
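
# Hypothetical guard (assumes the rule-based cells above were executed in this session):
assert 'class_instances' in globals() and 'groundtruth' in globals(), 'Run the rule-based section first'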

In [ ]:
def get_seedset(class_instances, n=3):
    seedset = {}
    for class_type, instances in class_instances.items():
        shuffle(instances)
        seedset[class_type] = instances[:n]
    return seedset

In [ ]:
coverage = {}
iteration_step = 5

for method in methods:
    coverage[method] = {}
    
    relations = pickle.load(file(workspace + '/data/'+method+'/'+keyword+'/relations'+coref_suffix+filtered_suffix+'.pickle'))
    relations = [[i['arg1'].lower(), lemmatizer.lemmatize(i['rel'].lower(), pos='v'), i['arg2'].lower()] for i in relations]
        
    predicates = ontologization.get_predicates(relations, normalization=False)
    objects = ontologization.get_objects(relations, split=True, normalization=True)
    class_instances = ontologization.class_instances_by_rules(relations, rules)
    
    n_seedsets = 5
    for n_seedset in xrange(n_seedsets): 
        seedset = get_seedset(class_instances, 3)
        
        for class_type in seedset.keys():
            if class_type not in coverage[method].keys():
                coverage[method][class_type] = []
                
            bootstrap_iterator = ontologization.bootstrap_lsa(seedset[class_type], objects, predicates, 
                                               expansion=1, iterations=len(groundtruth[class_type]), yield_step=iteration_step)
            iter_count = 1
            while True:
                try:
                    res = bootstrap_iterator.next()
                    overlap_score = ontologization.overlap(res, groundtruth[class_type])
                    fp_ratio = len(set(res)-set(groundtruth[class_type]))/len(res)
                    # First seed set run appends values; subsequent runs average into the stored ones.
                    if len(coverage[method][class_type]) < iter_count:
                        coverage[method][class_type].append([overlap_score, fp_ratio])
                    else:
                        coverage[method][class_type][iter_count-1][0] += overlap_score
                        coverage[method][class_type][iter_count-1][0] /= 2.0
                        coverage[method][class_type][iter_count-1][1] += fp_ratio
                        coverage[method][class_type][iter_count-1][1] /= 2.0
                    iter_count += 1
                except StopIteration:
                    break

In [ ]:
coverage

In [ ]:
import itertools
def flip(items, ncol):
    return itertools.chain(*[items[i::ncol] for i in range(ncol)])

In [ ]:
rcParams['figure.figsize'] = (12.0, 10.0)
styles = ['-', '--']
for class_type in seedset.keys():
    fig = figure()
    ax = fig.add_subplot(1,1,1)
    
    count = 0
    for method in methods:
        y1 = [i[0] for i in coverage[method][class_type]]
        y2 = [i[1] for i in coverage[method][class_type]]
        x = arange(1, len(y1)+1)*iteration_step
        plot(x, y1, styles[0], color=colors[count], label=labels[method], linewidth=2.5)
        plot(x, y2, styles[1], color=colors[count], linewidth=2.5)
        count += 1
    
    fontsize=30
    xlabel('No. of entities bootstrapped', fontsize=fontsize+2)
    #ylabel('Num. of instances bootstrapped', fontsize=fontsize+2)

    #Get artists and labels for legend
    handles, _labels = ax.get_legend_handles_labels()
    
    #Create custom artists
    custom_artists = []
    custom_artists.append(plt.Line2D((0,1),(0,0), color='k', linestyle='-'))
    custom_artists.append(plt.Line2D((0,1),(0,0), color='k', linestyle='--'))
    
    ax.legend(flip(handles+custom_artists, 3),
              flip(_labels + ['Overlap ($O$)', 'Residual ($R$)'], 3),
              ncol=3, prop={'size': fontsize-6},
              loc='upper center', bbox_to_anchor=(0.5, 1.1),
              fancybox=True)
    
    xticks(fontsize=fontsize)
    yticks(fontsize=fontsize)
    ylim_down, ylim_up = ax.get_ylim()
    ylim(ylim_down, ylim_up*1.05)
    xlim_down, xlim_up = ax.get_xlim()
    xlim(xlim_down, xlim_up*0.95)
    
    grid(True)
    xgridlines = getp(gca(), 'xgridlines')
    ygridlines = getp(gca(), 'ygridlines')
    setp(xgridlines, 'color', '0.6')
    setp(ygridlines, 'color', '0.6')
    
    fname = workspace + '/data/results/qualitative/entity-identification/bootstrapping/'+keyword+'/'+class_type
    savefig(fname+'.pdf', dpi=200, facecolor='w', edgecolor='w', orientation='landscape', 
            papertype=None, format=None, transparent=False, bbox_inches='tight', pad_inches=0.1)
    savefig(fname+'.png', dpi=200, facecolor='w', edgecolor='w', orientation='landscape', 
            papertype=None, format=None, transparent=False, bbox_inches='tight', pad_inches=0.1)
    close()

In [ ]:
close('all')